Linux Shell 监控网络延迟与磁盘 IO 延迟

1 主节点-开启网络延迟监控程序

1.1 使用 mtr 的网络延迟监控脚本(mtr_latency_check.sh)

#!/bin/bash

# ============================
# 基于 mtr 的网络延迟监控脚本
# ============================

TARGET_IP="192.168.1.100"      # Change to the IP you want to probe
THRESHOLD_MS=100               # Latency alert threshold, in milliseconds
LOG_FILE="/var/log/net_latency.log"
SLEEP_INTERVAL=5               # Seconds to wait between probes
# Resolve mtr with the shell builtin `command -v` instead of the external
# `which`, which is not guaranteed to exist or to behave the same everywhere.
# MTR_PATH is left empty when mtr is not installed.
MTR_PATH=$(command -v mtr)

# Alert hook (can be extended to send mail, DingTalk messages, etc.).
# Prints "<timestamp> [ALERT] <message>" to stdout and appends the same line
# to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - alert text
alert() {
  local text=$1
  local stamp=$(date '+%F %T')
  # Build the line once so stdout and the log always carry the same timestamp.
  local entry="$stamp [ALERT] $text"
  printf '%s\n' "$entry"
  printf '%s\n' "$entry" >> "$LOG_FILE"
}

# Probe TARGET_IP once with mtr, log the last hop's average latency and raise
# an alert when the target looks unreachable or the latency exceeds
# THRESHOLD_MS.
# Globals:   MTR_PATH, TARGET_IP, THRESHOLD_MS, LOG_FILE (read)
# Calls:     alert
# Outputs:   appends INFO/ERROR lines to LOG_FILE; alerts go through alert()
check_latency() {
  # Fall back to a PATH lookup when MTR_PATH was left empty.
  local mtr_bin=${MTR_PATH:-mtr}
  local report loss rtt

  # A missing binary is a setup problem, not an unreachable target.
  if ! command -v "$mtr_bin" > /dev/null 2>&1; then
    alert "未找到 mtr 命令,请先安装 mtr"
    return
  fi

  # Keep the report in a variable: a fixed file name under /tmp is predictable
  # and would be shared by every instance of this script.
  if ! report=$("$mtr_bin" -rwzc 1 "$TARGET_IP" 2> /dev/null); then
    alert "目标 $TARGET_IP 无法到达!"
    echo "$(date '+%F %T') [ERROR] MTR failed for $TARGET_IP" >> "$LOG_FILE"
    return
  fi

  # mtr report rows end with: Loss% Snt Last Avg Best Wrst StDev.
  # Counting from the right keeps this independent of the leading hop/AS/host
  # columns: Loss% is NF-6 and Avg is NF-3. The last matching row is the
  # final hop.
  read -r loss rtt <<< "$(awk '
    NF >= 7 { loss = $(NF - 6); avg = $(NF - 3) }
    END { sub(/%$/, "", loss); print loss, avg }
  ' <<< "$report")"

  # mtr exiting 0 does not by itself prove the last hop answered
  # (NOTE(review): confirm against the installed mtr version), so treat
  # 100% loss on the last hop as unreachable instead of logging "0.0 ms".
  if [[ $loss =~ ^[0-9]+(\.[0-9]+)?$ ]] \
    && awk -v l="$loss" 'BEGIN { exit !(l + 0 >= 100) }'; then
    alert "目标 $TARGET_IP 无法到达!"
    echo "$(date '+%F %T') [ERROR] 100% packet loss to $TARGET_IP" >> "$LOG_FILE"
    return
  fi

  # Refuse anything that is not a plain decimal number.
  if [[ ! $rtt =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
    alert "无法解析延迟信息:$rtt"
    return
  fi

  # Record the measurement.
  echo "$(date '+%F %T') [INFO] MTR to $TARGET_IP: ${rtt} ms" >> "$LOG_FILE"

  # Compare as floating point: integer truncation would let e.g. 100.9 ms
  # pass a 100 ms threshold unnoticed.
  if awk -v rtt="$rtt" -v max="$THRESHOLD_MS" 'BEGIN { exit !(rtt + 0 > max + 0) }'; then
    alert "延迟过高:${rtt} ms,阈值为 ${THRESHOLD_MS} ms"
  fi
}

# Main loop: probe forever, pausing SLEEP_INTERVAL seconds between probes.
while true; do
  check_latency
  # Quoted, with a 5-second fallback, so an empty SLEEP_INTERVAL cannot turn
  # this into a busy loop that runs mtr back to back.
  sleep "${SLEEP_INTERVAL:-5}"
done

1.2 脚本说明

1.3 使用步骤

# Install mtr (if it is not installed yet)
sudo apt install mtr  # Ubuntu/Debian
sudo yum install mtr  # CentOS/RHEL

# Save the script
nano mtr_latency_check.sh

# Paste the script content, then save and exit
chmod +x mtr_latency_check.sh

# Run in the background
nohup ./mtr_latency_check.sh &

2 备节点-开启磁盘IO监控程序

2.1 创建磁盘IO 监控脚本

#!/bin/bash

# Log file that receives the samples, and the pause between samples (seconds).
LOG_FILE="disk_io_log.txt"
INTERVAL=5

# Stop early when iostat is not available.
command -v iostat > /dev/null 2>&1 || {
    echo "Error: iostat command not found. Please install sysstat package."
    exit 1
}

# Create the log file when it does not exist yet, and say so.
if [[ ! -f "$LOG_FILE" ]]; then
    touch "$LOG_FILE"
    echo "Created log file: $LOG_FILE"
fi

# Tell the operator what is happening and where the data goes.
printf '%s\n' \
    "Starting continuous disk IO monitoring... (Press Ctrl+C to stop)" \
    "Logging to: $LOG_FILE"

# Mark the start of this monitoring session in the log.
{
    echo "Disk IO monitoring started at $(date "+%Y-%m-%d %H:%M:%S")"
    echo "----------------------------------------"
} >> "$LOG_FILE"

# Sample forever: one timestamped iostat report per iteration, appended to
# LOG_FILE, followed by a pause of INTERVAL seconds.
while true; do
    TIMESTAMP=$(date "+%Y-%m-%d %H:%M:%S")
    {
        echo "=== Disk IO Statistics - $TIMESTAMP ==="
        # iostat flags:
        #   -y   omit the first report, which holds averages since boot;
        #        without it a single "1 1" run never shows current activity
        #   -d   device statistics only
        #   -x   extended statistics (includes latency columns)
        #   -k   report throughput in kB
        #   1 1  interval of 1 second, 1 report
        # With -y the command measures for one second before printing, so
        # each iteration takes about INTERVAL + 1 seconds.
        iostat -y -d -x -k 1 1
        echo "----------------------------------------"
        echo ""
    } >> "$LOG_FILE"
    sleep "$INTERVAL"
done

2.2 脚本说明

  1. 输入: 读取 disk_io_log.txt(由之前 iostat 脚本生成)。
  2. 输出: 生成 disk_io_analysis.txt,包含:
    • 读写吞吐量(KB/s)
    • 总吞吐量
    • 每秒事务数 (tps)
    • 如果日志包含 -x 参数数据,还会分析:
      • IO 等待时间 (await)
      • 服务时间 (svctm)
      • 队列等待时间
      • 磁盘利用率 (%util)

2.3 创建磁盘IO 分析脚本

#!/bin/bash

# Input log and the report file this script writes.
LOG_FILE="disk_io_log.txt"
OUTPUT_FILE="disk_io_analysis.txt"

# Nothing to analyse without the log file.
[[ -f "$LOG_FILE" ]] || {
    echo "Error: Log file $LOG_FILE not found!"
    exit 1
}

# Check that bc is available (used for floating-point arithmetic).
command -v bc > /dev/null 2>&1 || {
    echo "Error: bc command not found. Please install bc package."
    exit 1
}

# Start the report from scratch: truncate the file and write its header.
{
    echo "Disk IO Analysis Report - Generated on $(date "+%Y-%m-%d %H:%M:%S")"
    echo "----------------------------------------"
} > "$OUTPUT_FILE"

# Succeed when numeric $1 is strictly greater than numeric $2 (floating point).
_io_exceeds() {
    awk -v value="$1" -v limit="$2" 'BEGIN { exit !(value + 0 > limit + 0) }'
}

# Append the analysis of one device sample to OUTPUT_FILE.
#
# Arithmetic is done with awk so results are always printed with two decimals
# (bc prints ".30" for 0.30) and a failed calculation cannot produce an empty
# string inside (( )).
#
# Globals:   OUTPUT_FILE (read; the report is appended to it)
# Arguments: $1 - sample timestamp
#            $2 - device name
#            $3 - transactions per second
#            $4 - read throughput in kB/s
#            $5 - write throughput in kB/s
#            $6 - await in ms (may be empty: latency section is skipped)
#            $7 - svctm in ms (may be empty: reported as N/A)
#            $8 - %util (may be empty: utilization check is skipped)
analyze_io() {
    local timestamp="$1"
    local device="$2"
    local tps="$3"
    local kb_read_s="$4"
    local kb_wrtn_s="$5"
    local await="$6"
    local svctm="$7"
    local util="$8"
    # Kept local so repeated calls do not leak state into the caller.
    local total_throughput queue_time

    # Total throughput = read + write.
    total_throughput=$(awk -v r="$kb_read_s" -v w="$kb_wrtn_s" \
        'BEGIN { printf "%.2f", r + w }')

    # Queue wait time = await - svctm, only when both are known.
    if [[ -n "$await" && -n "$svctm" ]]; then
        queue_time=$(awk -v a="$await" -v s="$svctm" \
            'BEGIN { printf "%.2f", a - s }')
    else
        queue_time="N/A"
    fi

    {
        echo "Timestamp: $timestamp"
        echo "Device: $device"
        echo "----------------------------------------"
        echo "Read Throughput: $kb_read_s KB/s"
        echo "Write Throughput: $kb_wrtn_s KB/s"
        echo "Total Throughput: $total_throughput KB/s"
        echo "Transactions Per Second (tps): $tps"

        # Latency and utilization details.
        if [[ -n "$await" ]]; then
            echo "Average IO Wait Time (await): $await ms"
            if [[ -n "$svctm" ]]; then
                echo "Service Time (svctm): $svctm ms"
                echo "Queue Wait Time: $queue_time ms"
            else
                # No svctm value supplied: do not print an empty value with a unit.
                echo "Service Time (svctm): N/A"
                echo "Queue Wait Time: N/A"
            fi
            echo "Disk Utilization: $util%"

            # Latency rating: > 20 ms high, > 10 ms moderate, otherwise low.
            if _io_exceeds "$await" 20; then
                echo "Warning: High latency detected (await > 20ms)"
            elif _io_exceeds "$await" 10; then
                echo "Note: Moderate latency (10ms < await ≤ 20ms)"
            else
                echo "Status: Low latency (await ≤ 10ms)"
            fi
        fi

        if [[ -n "$util" ]]; then
            if _io_exceeds "$util" 90; then
                echo "Warning: Disk nearly saturated (%util > 90%)"
            elif _io_exceeds "$util" 70; then
                echo "Note: High disk utilization (%util > 70%)"
            fi
        fi

        # 102400 kB/s = 100 MB/s.
        if _io_exceeds "$total_throughput" 102400; then
            echo "Note: High throughput detected (>100 MB/s)"
        fi

        echo "----------------------------------------"
        echo ""
    } >> "$OUTPUT_FILE"
}

# Parse the iostat log and call analyze_io once per device row.
#
# Columns are located by name from each "Device" header line instead of by
# fixed position, because the number and order of `iostat -x` columns differ
# between sysstat releases. Values passed to analyze_io:
#   tps    = r/s + w/s
#   await  = the "await" column when present, otherwise the r/s- and
#            w/s-weighted mean of r_await and w_await
#   svctm  = the "svctm" column when present, otherwise empty
#   util   = the "%util" column
# Blocks whose header lacks r/s, w/s, rkB/s, wkB/s or %util are skipped.
#
# Arguments: $1 - path of the log file to parse
# Returns:   1 when the log cannot be read, 0 otherwise
parse_iostat_log() {
    local log_file="$1"
    local timestamp device tps kb_read_s kb_wrtn_s await svctm util

    if [[ ! -r "$log_file" ]]; then
        echo "Error: cannot read log file '$log_file'" >&2
        return 1
    fi

    # awk emits one "|"-separated record per device row; "|" (unlike
    # whitespace) lets read keep empty fields such as a missing svctm.
    while IFS='|' read -r timestamp device tps kb_read_s kb_wrtn_s await svctm util; do
        analyze_io "$timestamp" "$device" "$tps" "$kb_read_s" "$kb_wrtn_s" "$await" "$svctm" "$util"
    done < <(
        awk '
            function col_value(name) { return (name in col) ? $(col[name]) : "" }

            # Section marker: "=== Disk IO Statistics - DATE TIME ==="
            $1 == "===" && $2 == "Disk" && $3 == "IO" && $4 == "Statistics" && $5 == "-" && $8 == "===" {
                if ($6 ~ /^[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]$/ &&
                    $7 ~ /^[0-9][0-9]:[0-9][0-9]:[0-9][0-9]$/)
                    ts = $6 " " $7
                next
            }

            # A blank line ends the current device table.
            NF == 0 { ncol = 0; next }

            # Header line: remember the position of every column name.
            $1 == "Device" || $1 == "Device:" {
                split("", col)
                for (i = 2; i <= NF; i++) col[$i] = i
                ncol = NF
                if (!(("r/s" in col) && ("w/s" in col) && ("rkB/s" in col) &&
                      ("wkB/s" in col) && ("%util" in col)))
                    ncol = 0
                next
            }

            # Device row: same width as the header and numeric after the name.
            ncol > 0 && NF == ncol && $2 ~ /^[0-9]+(\.[0-9]+)?$/ {
                r = col_value("r/s")
                w = col_value("w/s")
                tps = r + w
                if ("await" in col)
                    await = $(col["await"])
                else if (("r_await" in col) && ("w_await" in col))
                    await = sprintf("%.2f", tps > 0 ? (r * $(col["r_await"]) + w * $(col["w_await"])) / tps : 0)
                else
                    await = ""
                printf "%s|%s|%.2f|%s|%s|%s|%s|%s\n", ts, $1, tps, col_value("rkB/s"), col_value("wkB/s"), await, col_value("svctm"), col_value("%util")
            }
        ' < "$log_file"
    )
    return 0
}

echo "Analyzing $LOG_FILE..."
if parse_iostat_log "$LOG_FILE"; then
    echo "Analysis complete. Results saved to $OUTPUT_FILE"
else
    echo "Analysis failed: could not read $LOG_FILE" >&2
fi

2.4 脚本说明